import org.apache.http.HttpHost;
import pipeline.CapsePipeline;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;


/**
 * Created by zhangcheng on 17/4/18.
 */
public class TestCralwer implements PageProcessor {

    static String PATH = "http://jobs.51job.com/changsha/\\d+\\.html";
    static String PATH2 = "http://jobs.51job.com/changsha-\\w+\\/\\d+\\.html?s=01&t=0";

    private Site site = Site.me()
            .setRetryTimes(3)
            .setSleepTime(1000)
            .setHttpProxy(new HttpHost("10.101.1.6", 80))
            .setTimeOut(10000);

    public void process(Page page) {
        if (page.getUrl().regex(PATH).match()||page.getUrl().regex(PATH2).match()) {
            String city = page.getHtml().xpath("/html/body/div[2]/div[2]/div[2]/div/div[1]/span/text()").get();
            String workAddress = page.getHtml().xpath("/html/body/div[2]/div[2]/div[3]/div[5]/div/p/text()").get();
            page.putField("city", city);
            page.putField("workAddress", workAddress);
            System.out.println("城市:" + city + "\t工作地址:" + workAddress);
        } else {
            page.addTargetRequests(page.getHtml().links().regex(PATH).all());
        }
    }

    public Site getSite() {
        return site;
    }

    public static void main(String[] args) {
        //http://search.51job.com/list/190200,000000,0000,32%252C01,9,99,%2B,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=
        Spider spider = Spider
                .create(new TestCralwer())
                .addPipeline(new CapsePipeline("G:\\webmagic"));
        for (int i = 1; i < 100; i++) {
            spider.addUrl("http://search.51job.com/list/190200,000000,0000,32%252C01,9,99,%2B,2," + i + ".html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=");
        }
        spider.thread(2).run();
    }
}
